STATS506-Problem Set 5

Author

Shenyi Tang

Published

November 19, 2024

Shenyi Tang’s GitHub Repo For STATS 506 FA 2024

https://github.com/shenyi-tang/stats506-computing-methods-and-tools.git

library(roxygen2)
library(Rcpp)
library(ggplot2)
library(tidyverse)
library(plotly)
library(nycflights13)
library(data.table)

Problem 1 - OOP Programming

  1. For the rational class, define the following:
  • A constructor
  • A validator that ensures the denominator is non-zero
  • A show method
  • A simplify method, to obtain this simplest form(e.g. simplify(2/4) produces 1/2)
  • A quotient method (e.g. quotient(3/7) produces .42857143...). It should support a digits argument but only in the printing, not the return result (Hint: what does print return?).
  • Addition, subtraction, multiplication, division. These should all return a rational.
  • You’ll (probably) need GCD and LCM as part of some of these calculations; include these functions using Rcpp. Even if you don’t need these functions for another calculation, include them.
# a.1 define the rational class
##' @title define "rational" class
##' @slot numerator, an integer, the number above the line in the common fraction
##' @slot denominator, an integer, the number below the line in the common fraction
##' 
rational <- setClass("rational",
                     slots = c(numerator = "numeric",
                               denominator = "numeric"))

# a.2 a validator
##' @title set a validator for 'rational' class
##' @detail to check whether both the numerator and denominator are integers
##' @detail to check whether the denominator is non-zero
##' 
setValidity("rational", function(object){
  if (object@denominator == 0) {
    stop(paste0("@denominator = 0, is not a valid denominator"))
  }
  else if(!is.numeric(object@numerator) || !is.numeric(object@denominator)) 
    {
    stop("Both the numerator and denominator should be numeric")
  }
  else if (floor(object@numerator) != object@numerator || !is.numeric(object@numerator)) {
    stop(paste0("@numerator = ", object@numerator, " is not a valid numerator"))
  }
  else if (floor(object@denominator) != object@denominator || !is.numeric(object@denominator)) {
    stop(paste0("@denominator = ", object@denominator, " is not a valid denominator"))
  }
  return(TRUE)
})

# a.3 a show method
##' @title show the rational in a fraction style
##' 
setMethod("show", "rational",
          function(object){
            cat(object@numerator, "/", object@denominator, "\n", sep = "")
          })

# a.4 a simplify method
# GCD - Greatest Common Divisor
# LCM - Least Common Multiple
cppFunction("
  int gcd(int a, int b){
    while(b != 0){
      int temp = b;
      b = a % b;
      a = temp;
    }
    return a;
  }
  
  int lcm(int a, int b){
    return a * b / gcd(a, b);
  }   
            ")
##' @title create a method to simplify the fraction
##' @param object, with a "rational" class
##' @detail divide the numerator and denominator respectively by their GCD
##'
setGeneric("simplify", 
           function(object){
             standardGeneric("simplify")
           })
Creating a new generic function for 'simplify' in the global environment
setMethod("simplify", "rational",
          function(object){
            g <- gcd(abs(object@numerator), abs(object@denominator))
            simplified_numerator <- object@numerator / g
            simplified_denominator <- object@denominator / g
            rational(numerator = simplified_numerator, 
                     denominator = simplified_denominator)
          })

# a.5 a quotient method
##' @title calculate the quotient of the fraction
##' @param object, with a "rational" class
##' @param digits, default value is null
##' 
setGeneric("quotient",
           function(object, digits = NULL){
             standardGeneric("quotient")
           })

setMethod("quotient", "rational",
          function(object, digits = NULL){
            q <- as.numeric(object@numerator) / as.numeric(object@denominator)
            if (!is.null(digits)) {
              if (!is.numeric(digits) || digits != floor(digits)){
                stop("Digits should be an integer")
              }
              # use "format" to control the output of the result 
              formatted_q <- format(round(q, digits), nsmall = digits, scientific = FALSE)
              print(formatted_q)
            }
            else {
              print(format(q, scientific = FALSE))
            }
            return(invisible(q))
          })
           
# a.6 Operations: Addition, subtraction, multiplication, and division
# Addition
setMethod("+", signature(e1 = "rational", 
                         e2 = "rational"),
          function(e1, e2){
            new_numerator <- e1@numerator * e2@denominator + e2@numerator * e1@denominator
            new_denominator <- e1@denominator * e2@denominator
            simplify(rational(numerator = new_numerator,
                              denominator = new_denominator))
          })

# subtraction
setMethod("-", signature(e1 = "rational",
                         e2 = "rational"),
          function(e1, e2){
            new_numerator <- e1@numerator * e2@denominator - e2@numerator * e1@denominator
            new_denominator <- e1@denominator * e2@denominator
            simplify(rational(numerator = new_numerator,
                              denominator = new_denominator))
          })

# multiplication
setMethod("*", signature(e1 = "rational", 
                         e2 = "rational"),
          function(e1, e2){
            new_numerator <- e1@numerator * e2@numerator
            new_denominator <- e1@denominator * e2@denominator
            simplify(rational(numerator = new_numerator,
                              denominator = new_denominator))
          })

# division
setMethod("/", signature(e1 = "rational",
                         e2 = "rational"),
          function(e1, e2){
            if (e2@numerator == 0){
              stop("Divisor cannot be zero")
            }
            new_numerator <- e1@numerator * e2@denominator
            new_denominator <- e1@denominator * e2@numerator
            simplify(rational(numerator = new_numerator,
                              denominator = new_denominator))
          })
  1. Use your rational class to create 3 objects
  • r1: \(\frac{24}{6}\)
  • r2: \(\frac{7}{230}\)
  • r3: \(\frac{0}{4}\)
# create 3 rational objects
r1 <- rational(numerator = 24, denominator = 6)
r2 <- rational(numerator = 7, denominator = 230)
r3 <- rational(numerator = 0, denominator = 4)

# test cases
r1
24/6
r3
0/4
r1 + r2
927/230
r1 - r2
913/230
r1 * r2
14/115
r1 / r2
920/7
r1 + r3
4/1
r1 * r3
0/1
r2 / r3
Error in r2/r3: Divisor cannot be zero
quotient(r1)
[1] "4"
quotient(r2)
[1] "0.03043478"
quotient(r2, digits = 3.14)
Error in quotient(r2, digits = 3.14): Digits should be an integer
quotient(r2, digits = "avocado")
Error in quotient(r2, digits = "avocado"): Digits should be an integer
q2 <- quotient(r2, digits = 3)
[1] "0.030"
q2
[1] 0.03043478
quotient(r3)
[1] "0"
simplify(r1)
4/1
simplify(r2)
7/230
simplify(r3)
0/1
  1. Show that your validator does not allow the creation of rational’s with 0 denominator, and check other malformed input to your constructor.
rational(numerator = 2, denominator = 0)
Error in validityMethod(object): @denominator = 0, is not a valid denominator
rational(numerator = "n", denominator = 2)
Error in validObject(.Object): invalid class "rational" object: invalid object for slot "numerator" in class "rational": got class "character", should be or extend class "numeric"
rational(numerator = "3", denominator = 4)
Error in validObject(.Object): invalid class "rational" object: invalid object for slot "numerator" in class "rational": got class "character", should be or extend class "numeric"

Problem 2 - plotly

df <- read.csv("df_for_ml_improved_new_market.csv")
  1. Regenerate your plot which addresses the second question from last time:
  • ii Does the distribution of genre of sales across years appear to change?
# transfer the origin data set to longer table
# remove the prefix "Genre___"
pivot_df <- df %>% 
  tidyr::pivot_longer(
    cols = c(Genre___Photography, Genre___Print, Genre___Sculpture, Genre___Painting, Genre___Others),
    names_to = 'genre',
    values_to = 'if.genre'
  ) %>% 
  filter(if.genre == 1) %>% 
  mutate(genre = gsub(".*___", "", genre))


pivot_df %>%
  group_by(year, genre) %>%
  summarise(count = n()) %>%
  plot_ly(x = ~year, y = ~count, color = ~genre, type = 'bar', 
          colors = 'Pastel1', 
          text = ~paste("Year:", year, "<br>Genre:", genre, "<br>Count:", count),
          hoverinfo = 'text') %>%
  layout(title = "Distribution of Genre of Sales across Years",
         xaxis = list(title = "Year"),
         yaxis = list(title = "Number of Sales"),
         barmode = 'stack')
  • Generally, sales of each genre increase over year. By 2011, the number of sales are much more higher than other years.
  1. Generate an interactive plot with plotly that can address both of these questions from last time:
# Calculate average price for each year and genre
genre_price_trends <- pivot_df %>%
  group_by(year, genre) %>%
  summarise(avg_price = mean(price_usd, na.rm = TRUE), .groups = 'drop')

# Calculate overall average price per year
overall_price_trend <- pivot_df %>%
  group_by(year) %>%
  summarise(avg_price = mean(price_usd, na.rm = TRUE), .groups = 'drop') %>%
  mutate(genre = "Overall Average")

# Combine the two datasets for a single plot
combined_trends <- bind_rows(
  genre_price_trends,
  overall_price_trend
)

# Create an interactive plot
plot <- plot_ly(data = combined_trends, x = ~year, y = ~avg_price, color = ~genre, 
                type = 'scatter', mode = 'lines+markers',
                text = ~paste("Year:", year, "<br>Price:", scales::dollar(avg_price))) %>%
  layout(
    title = list(text = "Change in Sales Price by Genre and Year",
                 x = 0.5,
                 font = list(size = 18, face = "bold")),
    xaxis = list(title = "Year"),
    yaxis = list(title = "Average Sales Price (USD)", tickformat = "$"),
    legend = list(title = list(text = "Genre"))
  )

# Render the plot
plot
  • According to the plot above, photography and prints showed the most dramatic price changes, while sculpture maintained the most stable pricing over time.

Problem 3 - data.table

data("flights")
flights <- setDT(flights)
data("airports")
airports <- setDT(airports)
data("planes")
planes <- setDT(planes)
  1. Generate a table reporting the mean and median departure delay per airport. Generate a second table reporting the mean and median arrival delay per airport. Exclude any destination with under 10 flights.
  • Order both tables in descending mean delay
  • Both tables should use the airport names not the airport codes
  • Both tables should print all rows
# control the total width of the data.tables in general
options(width = 100)

# merge flights and airports to get the name of airports
# add departure airports name
flights <- flights[airports[, .(faa, name)],
              on = .(origin = faa),
              nomatch = 0] |> # join
  _[, dept_name := name] |> # add departure airports name
  _[, name := NULL] # drop the name column (from airports)

# add destination airports name
flights <- flights[airports[, .(faa, name)],
                   on = .(dest = faa),
                   nomatch = 0] |> # join
  _[, arr_name := name] |> # add arrival airports name
  _[, name := NULL] # drop the name column (from airports)

# table 1: calculate the departure delay
depart_delay <- flights[, .(mean_dept_delay = mean(dep_delay, na.rm = TRUE),
                            median_dept_delay = median(dep_delay, na.rm = TRUE),
                            flight_cnt = .N),
                        by = dept_name] |> # statistics
  _[flight_cnt >= 10] |> # filter out
  _[order(-mean_dept_delay)] # descending order
print(depart_delay, nrows = .N)
             dept_name mean_dept_delay median_dept_delay flight_cnt
                <char>           <num>             <num>      <int>
1: Newark Liberty Intl        15.17009                -1     119282
2: John F Kennedy Intl        12.25837                -1     105230
3:          La Guardia        10.34688                -3     104662
# # table 2: calculate the arrival delay
arrival_delay <- flights[, .(mean_arr_delay = mean(arr_delay, na.rm = TRUE),
                             median_dept_delay = median(arr_delay, na.rm = TRUE),
                             flight_cnt = .N),
                         by = arr_name] |> # statistics
  _[flight_cnt >= 10] |> # filter out
  _[order(-mean_arr_delay)] # descending order
print(arrival_delay, nrows = .N)
                                arr_name mean_arr_delay median_dept_delay flight_cnt
                                  <char>          <num>             <num>      <int>
 1:                Columbia Metropolitan    41.76415094              28.0        116
 2:                           Tulsa Intl    33.65986395              14.0        315
 3:                    Will Rogers World    30.61904762              16.0        346
 4:                 Jackson Hole Airport    28.09523810              15.0         25
 5:                        Mc Ghee Tyson    24.06920415               2.0        631
 6:               Dane Co Rgnl Truax Fld    20.19604317               1.0        572
 7:                        Richmond Intl    20.11125320               1.0       2454
 8:        Akron Canton Regional Airport    19.69833729               3.0        864
 9:                      Des Moines Intl    19.00573614               0.0        569
10:                   Gerald R Ford Intl    18.18956044               1.0        765
11:                      Birmingham Intl    16.87732342              -2.0        297
12:         Theodore Francis Green State    16.23463687               1.0        376
13: Greenville-Spartanburg International    15.93544304              -0.5        849
14:    Cincinnati Northern Kentucky Intl    15.36456376              -3.0       3941
15:            Savannah Hilton Head Intl    15.12950601              -1.0        804
16:          Manchester Regional Airport    14.78755365              -3.0       1009
17:                          Eppley Afld    14.69889841              -2.0        849
18:                               Yeager    14.67164179              -1.5        138
19:                     Kansas City Intl    14.51405836               0.0       2008
20:                          Albany Intl    14.39712919              -4.0        439
21:                General Mitchell Intl    14.16722038               0.0       2802
22:                       Piedmont Triad    14.11260054              -2.0       1606
23:               Washington Dulles Intl    13.86420212              -3.0       5700
24:               Cherry Capital Airport    12.96842105             -10.0        101
25:              James M Cox Dayton Intl    12.68048606              -3.0       1525
26:     Louisville International Airport    12.66938406              -2.0       1157
27:                  Chicago Midway Intl    12.36422360              -1.0       4113
28:                      Sacramento Intl    12.10992908               4.0        284
29:                    Jacksonville Intl    11.84483416              -2.0       2720
30:                       Nashville Intl    11.81245891              -2.0       6333
31:                Portland Intl Jetport    11.66040210              -4.0       2352
32:               Greater Rochester Intl    11.56064461              -5.0       2416
33:      Hartsfield Jackson Atlanta Intl    11.30011285              -1.0      17215
34:                Lambert St Louis Intl    11.07846451              -3.0       4339
35:                         Norfolk Intl    10.94909344              -4.0       1536
36:            Baltimore Washington Intl    10.72673385              -5.0       1781
37:                         Memphis Intl    10.64531435              -2.5       1789
38:                   Port Columbus Intl    10.60132291              -3.0       3524
39:                  Charleston Afb Intl    10.59296847              -4.0       2884
40:                    Philadelphia Intl    10.12719014              -3.0       1632
41:                  Raleigh Durham Intl    10.05238095              -3.0       8163
42:                    Indianapolis Intl     9.94043412              -3.0       2077
43:            Charlottesville-Albemarle     9.50000000              -5.0         52
44:               Cleveland Hopkins Intl     9.18161129              -5.0       4573
45:        Ronald Reagan Washington Natl     9.06695204              -2.0       9705
46:                      Burlington Intl     8.95099602              -4.0       2589
47:                 Buffalo Niagara Intl     8.94595186              -5.0       4681
48:                Syracuse Hancock Intl     8.90392501              -5.0       1761
49:                          Denver Intl     8.60650021              -2.0       7266
50:                      Palm Beach Intl     8.56297210              -3.0       6554
51:                             Bob Hope     8.17567568              -3.0        371
52:       Fort Lauderdale Hollywood Intl     8.08212154              -3.0      12055
53:                          Bangor Intl     8.02793296              -9.0        375
54:           Asheville Regional Airport     8.00383142              -1.0        275
55:                      Pittsburgh Intl     7.68099053              -5.0       2875
56:                       Gallatin Field     7.60000000              -2.0         36
57:                 NW Arkansas Regional     7.46572581              -2.0       1036
58:                           Tampa Intl     7.40852503              -4.0       7466
59:               Charlotte Douglas Intl     7.36031885              -3.0      14064
60:             Minneapolis St Paul Intl     7.27016886              -5.0       7185
61:                      William P Hobby     7.17618819              -4.0       2115
62:                         Bradley Intl     7.04854369             -10.0        443
63:                     San Antonio Intl     6.94537178              -9.0        686
64:                      South Bend Rgnl     6.50000000              -3.5         10
65:     Louis Armstrong New Orleans Intl     6.49017497              -6.0       3799
66:                        Key West Intl     6.35294118               7.0         17
67:                        Eagle Co Rgnl     6.30434783              -4.0        213
68:                Austin Bergstrom Intl     6.01990875              -5.0       2439
69:                   Chicago Ohare Intl     5.87661475              -8.0      17283
70:                         Orlando Intl     5.45464309              -5.0      14082
71:               Detroit Metro Wayne Co     5.42996346              -7.0       9384
72:                        Portland Intl     5.14157973              -5.0       1354
73:                        Nantucket Mem     4.85227273              -3.0        265
74:                      Wilmington Intl     4.63551402              -7.0        110
75:                    Myrtle Beach Intl     4.60344828             -13.0         59
76:    Albuquerque International Sunport     4.38188976              -5.5        254
77:         George Bush Intercontinental     4.24079040              -5.0       7198
78:        Norman Y Mineta San Jose Intl     3.44817073              -7.0        329
79:               Southwest Florida Intl     3.23814963              -5.0       3537
80:                       San Diego Intl     3.13916574              -5.0       2737
81:              Sarasota Bradenton Intl     3.08243131              -5.0       1211
82:            Metropolitan Oakland Intl     3.07766990              -9.0        312
83:   General Edward Lawrence Logan Intl     2.91439222              -9.0      15508
84:                   San Francisco Intl     2.67289152              -8.0      13331
85:                         Yampa Valley     2.14285714               2.0         15
86:              Phoenix Sky Harbor Intl     2.09704733              -6.0       4656
87:            Montrose Regional Airport     1.78571429             -10.5         15
88:                     Los Angeles Intl     0.54711094              -7.0      16174
89:               Dallas Fort Worth Intl     0.32212685              -9.0       8738
90:                           Miami Intl     0.29905978              -9.0      11728
91:                       Mc Carran Intl     0.25772849              -8.0       5997
92:                  Salt Lake City Intl     0.17625459              -8.0       2467
93:                           Long Beach    -0.06202723             -10.0        668
94:                Martha\\\\'s Vineyard    -0.28571429             -11.0        221
95:                  Seattle Tacoma Intl    -1.09909910             -11.0       3923
96:                        Honolulu Intl    -1.36519258              -7.0        707
97:            John Wayne Arpt Orange Co    -7.86822660             -11.0        825
98:                    Palm Springs Intl   -12.72222222             -13.5         19
                                arr_name mean_arr_delay median_dept_delay flight_cnt
  1. How many flights did the aircraft model with the fastest average speed take? Produce a tibble with 1 row, and entries for the model, average speed (in MPH) and number of flights.
# join flights and planes
new_flights <- flights[, flight_speed := distance / (air_time / 60)] |>
  _[planes, on = "tailnum", nomatch = 0]

# filter out the fastest model
fastest <- new_flights[, avg_speed := mean(flight_speed, na.rm = TRUE),
                       by = model] |>
  _[order(-avg_speed)] |>
  _[1, model]

# detailed information of fastest model
fastest_information <- new_flights[model == fastest,
                               .(avg_speed = mean(flight_speed, na.rm = TRUE),
                                 flights_cnt = .N),
                               by = model]
print(fastest_information)
     model avg_speed flights_cnt
    <char>     <num>       <int>
1: 777-222  482.6254           4